airq_data <- read.table('C:/AirQualityUCI.csv', sep = ';', header = TRUE, dec = ",")
airq_data <- airq_data %>% select_if(~sum(!is.na(.)) > 0) %>% drop_na()
airq_data[c('CO.GT.', 'C6H6.GT.', 'T', 'RH', 'AH')] <- sapply(airq_data[c('CO.GT.', 'C6H6.GT.', 'T', 'RH', 'AH')], as.numeric)
airq_data <- airq_data[, c(3:15)]
airq_data <- sapply(airq_data, function(x){ifelse (x == -200, NA, x)})
airq_data <- as.data.frame(airq_data)
cor_data <- cor(airq_data, method = 'spearman', use = 'pairwise.complete.obs')
corrplot(cor_data, method = 'circle')
• Exploring different models
first_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S1.CO. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(first_model)
##
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S1.CO. + PT08.S5.O3.,
## data = airq_data, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306.67 -97.01 -24.56 64.68 1635.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.547e+03 1.064e+01 145.413 < 2e-16 ***
## PT08.S2.NMHC. -3.705e-01 1.397e-02 -26.528 < 2e-16 ***
## PT08.S1.CO. -1.031e-01 1.860e-02 -5.543 3.05e-08 ***
## PT08.S5.O3. -2.444e-01 9.627e-03 -25.386 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.2 on 8987 degrees of freedom
## (366 observations deleted due to missingness)
## Multiple R-squared: 0.676, Adjusted R-squared: 0.6759
## F-statistic: 6251 on 3 and 8987 DF, p-value: < 2.2e-16
vif(first_model)
## PT08.S2.NMHC. PT08.S1.CO. PT08.S5.O3.
## 5.841369 6.860634 6.189541
Looking good! Let’s check the residuals:
plot(first_model, which = c(1,2))
residuals(first_model) %>% hist()
second_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. * PT08.S1.CO. * PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(second_model)
##
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. * PT08.S1.CO. * PT08.S5.O3.,
## data = airq_data, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -257.33 -80.69 -25.34 56.14 1673.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.651e+03 5.933e+01 61.54 <2e-16
## PT08.S2.NMHC. -2.337e+00 7.511e-02 -31.11 <2e-16
## PT08.S1.CO. -1.945e+00 7.432e-02 -26.17 <2e-16
## PT08.S5.O3. -1.586e+00 5.475e-02 -28.96 <2e-16
## PT08.S2.NMHC.:PT08.S1.CO. 1.628e-03 7.397e-05 22.02 <2e-16
## PT08.S2.NMHC.:PT08.S5.O3. 1.209e-03 4.696e-05 25.75 <2e-16
## PT08.S1.CO.:PT08.S5.O3. 1.037e-03 5.128e-05 20.21 <2e-16
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3. -8.839e-07 3.398e-08 -26.01 <2e-16
##
## (Intercept) ***
## PT08.S2.NMHC. ***
## PT08.S1.CO. ***
## PT08.S5.O3. ***
## PT08.S2.NMHC.:PT08.S1.CO. ***
## PT08.S2.NMHC.:PT08.S5.O3. ***
## PT08.S1.CO.:PT08.S5.O3. ***
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3. ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.6 on 8983 degrees of freedom
## (366 observations deleted due to missingness)
## Multiple R-squared: 0.761, Adjusted R-squared: 0.7608
## F-statistic: 4086 on 7 and 8983 DF, p-value: < 2.2e-16
vif(second_model)
## PT08.S2.NMHC. PT08.S1.CO.
## 228.9104 148.3410
## PT08.S5.O3. PT08.S2.NMHC.:PT08.S1.CO.
## 271.3158 854.6555
## PT08.S2.NMHC.:PT08.S5.O3. PT08.S1.CO.:PT08.S5.O3.
## 607.5205 749.8477
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3.
## 865.0315
R^2 is bigger, but now there is a multicollinearity
third_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(third_model)
##
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -321.44 -96.31 -22.02 65.38 1647.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.498e+03 5.911e+00 253.37 <2e-16 ***
## PT08.S2.NMHC. -4.083e-01 1.221e-02 -33.42 <2e-16 ***
## PT08.S5.O3. -2.727e-01 8.179e-03 -33.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.4 on 8988 degrees of freedom
## (366 observations deleted due to missingness)
## Multiple R-squared: 0.6749, Adjusted R-squared: 0.6749
## F-statistic: 9331 on 2 and 8988 DF, p-value: < 2.2e-16
vif(third_model)
## PT08.S2.NMHC. PT08.S5.O3.
## 4.4527 4.4527
plot(third_model, which = c(1,2))
residuals(third_model) %>% hist()
Looks good, R^2 is the same as with 3 predictors.
summary(fourth_model)
##
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S5.O3. + PT08.S2.NMHC. + NOx.GT. +
## AH + PT08.S4.NO2. + PT08.S1.CO. + NO2.GT. + C6H6.GT. + CO.GT. +
## PT08.S5.O3.:PT08.S2.NMHC. + PT08.S2.NMHC.:NO2.GT. + AH:PT08.S1.CO. +
## PT08.S2.NMHC.:PT08.S4.NO2. + NOx.GT.:PT08.S1.CO. + PT08.S2.NMHC.:PT08.S1.CO. +
## PT08.S2.NMHC.:AH + PT08.S4.NO2.:PT08.S1.CO. + PT08.S5.O3.:PT08.S4.NO2. +
## NOx.GT.:AH + PT08.S5.O3.:C6H6.GT. + PT08.S4.NO2.:C6H6.GT. +
## PT08.S4.NO2.:NO2.GT. + PT08.S1.CO.:C6H6.GT. + AH:C6H6.GT. +
## AH:NO2.GT. + PT08.S5.O3.:AH + NOx.GT.:C6H6.GT. + NOx.GT.:PT08.S4.NO2. +
## PT08.S2.NMHC.:NOx.GT. + NO2.GT.:C6H6.GT. + PT08.S2.NMHC.:C6H6.GT. +
## PT08.S5.O3.:NO2.GT. + PT08.S1.CO.:NO2.GT. + PT08.S5.O3.:PT08.S1.CO. +
## PT08.S4.NO2.:CO.GT. + NOx.GT.:NO2.GT. + PT08.S5.O3.:NOx.GT. +
## C6H6.GT.:CO.GT. + NO2.GT.:CO.GT. + AH:CO.GT. + NOx.GT.:CO.GT. +
## AH:PT08.S4.NO2. + PT08.S1.CO.:CO.GT. + PT08.S2.NMHC.:CO.GT.,
## data = airq_data_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -229.20 -43.39 -11.00 24.46 1633.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.472e+03 1.419e+02 31.509 < 2e-16 ***
## PT08.S5.O3. -5.576e-01 1.246e-01 -4.475 7.78e-06 ***
## PT08.S2.NMHC. -2.801e+00 2.697e-01 -10.389 < 2e-16 ***
## NOx.GT. 1.438e+00 2.730e-01 5.269 1.41e-07 ***
## AH -6.798e+02 8.962e+01 -7.586 3.73e-14 ***
## PT08.S4.NO2. 1.198e+00 1.677e-01 7.142 1.01e-12 ***
## PT08.S1.CO. -3.952e+00 2.169e-01 -18.222 < 2e-16 ***
## NO2.GT. -4.788e+00 8.511e-01 -5.626 1.92e-08 ***
## C6H6.GT. 2.643e+02 1.892e+01 13.970 < 2e-16 ***
## CO.GT. 1.858e+02 3.437e+01 5.406 6.64e-08 ***
## PT08.S5.O3.:PT08.S2.NMHC. 9.906e-04 2.215e-04 4.472 7.89e-06 ***
## PT08.S2.NMHC.:NO2.GT. 1.038e-03 1.309e-03 0.793 0.427754
## AH:PT08.S1.CO. -9.439e-01 7.970e-02 -11.843 < 2e-16 ***
## PT08.S2.NMHC.:PT08.S4.NO2. -4.514e-03 2.760e-04 -16.356 < 2e-16 ***
## NOx.GT.:PT08.S1.CO. 3.087e-04 1.769e-04 1.745 0.080977 .
## PT08.S2.NMHC.:PT08.S1.CO. 2.284e-03 3.896e-04 5.862 4.77e-09 ***
## PT08.S2.NMHC.:AH 1.681e+00 1.357e-01 12.388 < 2e-16 ***
## PT08.S4.NO2.:PT08.S1.CO. 2.580e-03 1.532e-04 16.837 < 2e-16 ***
## PT08.S5.O3.:PT08.S4.NO2. -6.974e-04 8.413e-05 -8.290 < 2e-16 ***
## NOx.GT.:AH 8.090e-01 7.445e-02 10.867 < 2e-16 ***
## PT08.S5.O3.:C6H6.GT. -5.509e-03 7.492e-03 -0.735 0.462179
## PT08.S4.NO2.:C6H6.GT. 1.142e-01 8.520e-03 13.405 < 2e-16 ***
## PT08.S4.NO2.:NO2.GT. 6.990e-03 5.469e-04 12.782 < 2e-16 ***
## PT08.S1.CO.:C6H6.GT. -1.577e-01 1.277e-02 -12.346 < 2e-16 ***
## AH:C6H6.GT. -4.332e+01 4.967e+00 -8.720 < 2e-16 ***
## AH:NO2.GT. -2.434e+00 2.431e-01 -10.013 < 2e-16 ***
## PT08.S5.O3.:AH 2.008e-01 4.027e-02 4.986 6.32e-07 ***
## NOx.GT.:C6H6.GT. 7.997e-02 1.156e-02 6.920 4.93e-12 ***
## NOx.GT.:PT08.S4.NO2. -1.088e-03 1.423e-04 -7.645 2.37e-14 ***
## PT08.S2.NMHC.:NOx.GT. -1.857e-03 4.105e-04 -4.523 6.20e-06 ***
## NO2.GT.:C6H6.GT. -1.512e-01 4.207e-02 -3.594 0.000328 ***
## PT08.S2.NMHC.:C6H6.GT. -4.924e-02 7.449e-03 -6.610 4.13e-11 ***
## PT08.S5.O3.:NO2.GT. 1.561e-03 3.359e-04 4.647 3.43e-06 ***
## PT08.S1.CO.:NO2.GT. -3.470e-03 7.353e-04 -4.720 2.41e-06 ***
## PT08.S5.O3.:PT08.S1.CO. 2.528e-04 6.636e-05 3.809 0.000140 ***
## PT08.S4.NO2.:CO.GT. -1.367e-01 2.416e-02 -5.658 1.60e-08 ***
## NOx.GT.:NO2.GT. 2.454e-03 4.560e-04 5.381 7.64e-08 ***
## PT08.S5.O3.:NOx.GT. -3.739e-04 9.185e-05 -4.071 4.74e-05 ***
## C6H6.GT.:CO.GT. 6.006e+00 1.622e+00 3.704 0.000214 ***
## NO2.GT.:CO.GT. -4.678e-01 1.082e-01 -4.324 1.55e-05 ***
## AH:CO.GT. 4.907e+01 1.526e+01 3.215 0.001309 **
## NOx.GT.:CO.GT. 3.801e-02 2.044e-02 1.860 0.062956 .
## AH:PT08.S4.NO2. 4.937e-02 2.667e-02 1.851 0.064202 .
## PT08.S1.CO.:CO.GT. 5.557e-02 2.518e-02 2.207 0.027342 *
## PT08.S2.NMHC.:CO.GT. -1.116e-01 6.346e-02 -1.758 0.078805 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.8 on 6896 degrees of freedom
## Multiple R-squared: 0.8622, Adjusted R-squared: 0.8613
## F-statistic: 980.7 on 44 and 6896 DF, p-value: < 2.2e-16
vif(fourth_model)
## PT08.S5.O3. PT08.S2.NMHC.
## 2024.1919 3998.8721
## NOx.GT. AH
## 2558.3458 1019.1196
## PT08.S4.NO2. PT08.S1.CO.
## 2769.3461 1775.4654
## NO2.GT. C6H6.GT.
## 1287.6663 15737.0072
## CO.GT. PT08.S5.O3.:PT08.S2.NMHC.
## 1935.2479 19480.9209
## PT08.S2.NMHC.:NO2.GT. AH:PT08.S1.CO.
## 7071.4440 1361.6993
## PT08.S2.NMHC.:PT08.S4.NO2. NOx.GT.:PT08.S1.CO.
## 31706.9947 2566.8099
## PT08.S2.NMHC.:PT08.S1.CO. PT08.S2.NMHC.:AH
## 33298.0434 3779.1217
## PT08.S4.NO2.:PT08.S1.CO. PT08.S5.O3.:PT08.S4.NO2.
## 8868.3781 4645.5430
## NOx.GT.:AH PT08.S5.O3.:C6H6.GT.
## 211.9693 9578.6010
## PT08.S4.NO2.:C6H6.GT. PT08.S4.NO2.:NO2.GT.
## 15057.8480 1773.1004
## PT08.S1.CO.:C6H6.GT. AH:C6H6.GT.
## 18882.8108 1858.7029
## AH:NO2.GT. PT08.S5.O3.:AH
## 122.2585 457.8575
## NOx.GT.:C6H6.GT. NOx.GT.:PT08.S4.NO2.
## 3687.7470 2254.8223
## PT08.S2.NMHC.:NOx.GT. NO2.GT.:C6H6.GT.
## 12591.6935 2831.2905
## PT08.S2.NMHC.:C6H6.GT. PT08.S5.O3.:NO2.GT.
## 6380.6342 864.0077
## PT08.S1.CO.:NO2.GT. PT08.S5.O3.:PT08.S1.CO.
## 2375.7489 1840.3849
## PT08.S4.NO2.:CO.GT. NOx.GT.:NO2.GT.
## 4347.9094 309.5329
## PT08.S5.O3.:NOx.GT. C6H6.GT.:CO.GT.
## 1082.2351 4614.8798
## NO2.GT.:CO.GT. AH:CO.GT.
## 784.4240 595.1000
## NOx.GT.:CO.GT. AH:PT08.S4.NO2.
## 493.1602 400.2981
## PT08.S1.CO.:CO.GT. PT08.S2.NMHC.:CO.GT.
## 2840.9688 17118.7947
plot(fourth_model, which = c(1,2))
residuals(fourth_model) %>% hist()
R^2 is bigger, but there is a lot of multicollinearity and the residuals looking not very good.
• So, I decided to choose the third model (with 2 predictors) as the best one. Now, I want to check if the transformation will improve the model.
R <- NULL
for (i in -5:5){
one <- airq_data$PT08.S2.NMHC.
two <- airq_data$PT08.S5.O3.
if (i < 0){
one <- one^(i) *(-1)
two <- two^(i) *(-1)
}
else if (i == 0) {
one <- log(one)
two <- log(two)
}
else {
one <- one^i
two <- two^i
}
model <- lm(PT08.S3.NOx. ~ one + two, data = airq_data, na.action = na.exclude)
R <- c(R, summary(model)$adj.r.squared)
}
transform <- data.frame('p' = c(-5:5), 'R^2' = R)
plot(transform, type = 'b')
transform[max(transform),]
## p R.2
## 5 -1 0.7488992
• Making the best model and predicting the values
airq_data$PT08.S2.NMHC. <- (airq_data$PT08.S2.NMHC.)^-1 * (-1)
airq_data$PT08.S5.O3. <- (airq_data$PT08.S5.O3.)^-1 * (-1)
final_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(final_model)
##
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.50 -80.94 -20.36 65.08 1718.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.665e+02 5.195e+00 32.04 <2e-16 ***
## PT08.S2.NMHC. -3.891e+05 7.947e+03 -48.96 <2e-16 ***
## PT08.S5.O3. -1.906e+05 5.208e+03 -36.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128.7 on 8988 degrees of freedom
## (366 observations deleted due to missingness)
## Multiple R-squared: 0.749, Adjusted R-squared: 0.7489
## F-statistic: 1.341e+04 on 2 and 8988 DF, p-value: < 2.2e-16
vif(final_model)
## PT08.S2.NMHC. PT08.S5.O3.
## 3.926512 3.926512
plot(final_model, which = c(1,2))
residuals(final_model) %>% hist()
• Prediction:
test_subset_multiple <- airq_data[which(row.names(airq_data) %in% sample(row.names(airq_data), 50, replace = FALSE)), c(5,10,7)]
test_multiple <- data.frame(PT08.S2.NMHC. = test_subset_multiple$PT08.S2.NMHC., PT08.S5.O3. = test_subset_multiple$PT08.S5.O3.)
test_subset_multiple$pred_PT08.S3.NOx. <- predict(final_model, newdata = test_multiple)
colnames(test_subset_multiple) <- c('real_PT08.S2.NMHC.','real_PT08.S5.O3.', 'real_PT08.S3.NOx.', 'pred_PT08.S3.NOx.')
df <- data.frame('y' = airq_data$PT08.S3.NOx. , 'x1' = airq_data$PT08.S2.NMHC., 'x2' = airq_data$PT08.S5.O3.)
### Estimation of the regression plane
mod <- lm(y ~ x1+x2, data = df)
cf.mod <- coef(mod)
df <- df %>% drop_na()
### Calculate z on a grid of x-y values
PT08.S2.NMHC. <- seq(min(df$x1),max(df$x1),length.out=25)
PT08.S5.O3. <- seq(min(df$x2),max(df$x2),length.out=25)
z <- t(outer(PT08.S2.NMHC., PT08.S5.O3., function(x,y) cf.mod[1]+cf.mod[2]*x+cf.mod[3]*y))
#### Draw the plane with "plot_ly" and add points with "add_trace"
R <- round(summary(final_model)$adj.r.squared, digits = 3)
p <- round(summary(final_model)$coefficients[2,4], digits = 3)
titl <- paste('R^2 =', as.character(R),', p-val =', as.character(p))
plot_ly(x=~PT08.S2.NMHC., y=~PT08.S5.O3., z=~z, type="surface", opacity=0.7) %>%
add_trace(data = test_subset_multiple, x = ~ real_PT08.S2.NMHC., y = ~ real_PT08.S5.O3.,
z = ~ pred_PT08.S3.NOx., type="scatter3d", size = 1, name= "Predicted", opacity=0.8) %>%
add_trace(data = test_subset_multiple, x = ~ real_PT08.S2.NMHC., y = ~ real_PT08.S5.O3.,
z = ~ real_PT08.S3.NOx., type="scatter3d", size = 1, name="Real", opacity=0.8) %>%
layout(title = titl)
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 1 observations
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 1 observations